library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
1. Read in the data
## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data
## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
### FINISH THE CODE HERE ###
# load COVID state-level data from NYT
cv_states <- as.data.frame(read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
### FINISH THE CODE HERE
cv_states <- merge(cv_states, state_pops, by="state")
2. Look at the data
dim(cv_states)
## [1] 32250 9
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 1 Alabama 2020-06-10 1 21989 744 1 4887871 96.50939 AL
## 2 Alabama 2021-10-31 1 832047 15573 1 4887871 96.50939 AL
## 3 Alabama 2021-05-26 1 542831 11138 1 4887871 96.50939 AL
## 4 Alabama 2020-04-19 1 4903 160 1 4887871 96.50939 AL
## 5 Alabama 2021-07-07 1 552911 11387 1 4887871 96.50939 AL
## 6 Alabama 2020-06-21 1 30021 839 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 32245 Wyoming 2020-12-01 56 33805 230 56 577737 5.950611 WY
## 32246 Wyoming 2021-08-15 56 68272 793 56 577737 5.950611 WY
## 32247 Wyoming 2021-03-16 56 55352 693 56 577737 5.950611 WY
## 32248 Wyoming 2021-04-12 56 56988 701 56 577737 5.950611 WY
## 32249 Wyoming 2021-04-01 56 56389 700 56 577737 5.950611 WY
## 32250 Wyoming 2020-11-30 56 33305 215 56 577737 5.950611 WY
str(cv_states)
## 'data.frame': 32250 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : chr "2020-06-10" "2021-10-31" "2021-05-26" "2020-04-19" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 21989 832047 542831 4903 552911 30021 117242 809485 134417 547135 ...
## $ deaths : int 744 15573 11138 160 11387 839 2037 14869 2285 11252 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
3. Format the data
# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
### FINISH THE CODE HERE
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame': 32250 obs. of 9 variables:
## $ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 215 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 428 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 69 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 496 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 341 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 53 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 31735 Wyoming 2021-11-06 56 105318 1243 56 577737 5.950611 WY
## 31791 Wyoming 2021-11-07 56 105318 1243 56 577737 5.950611 WY
## 31967 Wyoming 2021-11-08 56 105990 1243 56 577737 5.950611 WY
## 31942 Wyoming 2021-11-09 56 106287 1298 56 577737 5.950611 WY
## 31961 Wyoming 2021-11-10 56 106698 1298 56 577737 5.950611 WY
## 32016 Wyoming 2021-11-11 56 106698 1298 56 577737 5.950611 WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 215 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 428 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 69 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 496 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 341 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 53 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
## state date fips cases
## Washington : 661 Min. :2020-01-21 Min. : 1.00 Min. : 1
## Illinois : 658 1st Qu.:2020-08-03 1st Qu.:16.00 1st Qu.: 31774
## California : 657 Median :2021-01-05 Median :29.00 Median : 146459
## Arizona : 656 Mean :2021-01-05 Mean :29.78 Mean : 386609
## Massachusetts: 650 3rd Qu.:2021-06-09 3rd Qu.:44.00 3rd Qu.: 481786
## Wisconsin : 646 Max. :2021-11-11 Max. :72.00 Max. :4993930
## (Other) :28322
## deaths geo_id population pop_density
## Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
## 1st Qu.: 621 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
## Median : 2658 Median :29.00 Median : 4468402 Median : 107.860
## Mean : 7155 Mean :29.78 Mean : 6433897 Mean : 422.513
## 3rd Qu.: 8432 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
## Max. :73132 Max. :72.00 Max. :39557045 Max. :11490.120
## NA's :609
## abb
## WA : 661
## IL : 658
## CA : 657
## AZ : 656
## MA : 650
## WI : 646
## (Other):28322
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2021-11-11"
4. Add new_cases and new_deaths and correct outliers
# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
### FINISH THE CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>%
dplyr::filter(date >= "2021-06-01")
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +
geom_line() +
geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
geom_line() +
geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
### FINISH CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
#p2=NULL
5. Add additional variables
### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
## Warning: 强制改变过程中产生了NA
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
## Warning: 强制改变过程中产生了NA
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))
6. Explore scatterplots using plot_ly()
### FINISH CODE HERE
# pop_density vs. cases
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
## Warning: Ignoring 1 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# pop_density vs. deathsper100k
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Adding hoverinfo
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
hoverinfo = 'text',
text = ~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") ,
paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
hovermode = "compare")
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) +
geom_point() +
geom_smooth()
ggplotly(p)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
8. Multiple line chart
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
filter(state=="Florida") %>%
plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>%
add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")
9. Heatmaps
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Repeat with newper100k
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by="2 weeks")
cv_states_mat <- cv_states %>% select(state, date, newdeathsper100k) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newdeathsper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
10. Map
pick.date = "2021-10-15"
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% filter(date==pick.date) %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", pick.date), limits = c(0,shadeLimit))
fig <- fig %>% layout(
title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
geo = set_map_details
)
fig_pick.date <- fig
#############
### Map for today's date
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0,shadeLimit))
fig <- fig %>% layout(
title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
geo = set_map_details
)
fig_Today <- fig
### Plot together
subplot(fig_pick.date, fig_Today, nrows = 2, margin = .05)